{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import pandas as pd\n",
    "import matplotlib\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import jieba as jb\n",
    "import re\n",
    "from sklearn import utils\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "from gensim.models.doc2vec import TaggedDocument\n",
    "import multiprocessing\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### EDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据总量: 62774 .\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cat</th>\n",
       "      <th>review</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>46230</th>\n",
       "      <td>衣服</td>\n",
       "      <td>放洗衣机里洗破了</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18277</th>\n",
       "      <td>水果</td>\n",
       "      <td>这个苹果感觉是长熟的苹果，没有打蜡，不错，又甜又脆</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34140</th>\n",
       "      <td>洗发水</td>\n",
       "      <td>左手边开封的是超市买的，右手边未开封的京东买的，居然没有QS标志，详细的大家可以对比看......</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55755</th>\n",
       "      <td>酒店</td>\n",
       "      <td>酒店比较老，设施一般，但是服务相当好，价格也比较合理。值得推荐。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26964</th>\n",
       "      <td>洗发水</td>\n",
       "      <td>物流真给力，京东的物流一直都很棒，东西也很好，满意</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9599</th>\n",
       "      <td>平板</td>\n",
       "      <td>连接无线平板会卡，质量太差</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26055</th>\n",
       "      <td>水果</td>\n",
       "      <td>潘苹果也不甜，这次生鲜水果购货体验不好</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17148</th>\n",
       "      <td>水果</td>\n",
       "      <td>新鲜水果，速度很快，已经来过很多次了，打折有优惠</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22542</th>\n",
       "      <td>水果</td>\n",
       "      <td>个头不大，水分还可。就是苦的。不好</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27544</th>\n",
       "      <td>洗发水</td>\n",
       "      <td>老公喜欢用清扬的，洗过头感觉很舒服，是正品哦，，包装也很好，京东值得你去信赖！</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       cat                                             review\n",
       "46230   衣服                                           放洗衣机里洗破了\n",
       "18277   水果                          这个苹果感觉是长熟的苹果，没有打蜡，不错，又甜又脆\n",
       "34140  洗发水  左手边开封的是超市买的，右手边未开封的京东买的，居然没有QS标志，详细的大家可以对比看......\n",
       "55755   酒店                   酒店比较老，设施一般，但是服务相当好，价格也比较合理。值得推荐。\n",
       "26964  洗发水                          物流真给力，京东的物流一直都很棒，东西也很好，满意\n",
       "9599    平板                                      连接无线平板会卡，质量太差\n",
       "26055   水果                                潘苹果也不甜，这次生鲜水果购货体验不好\n",
       "17148   水果                           新鲜水果，速度很快，已经来过很多次了，打折有优惠\n",
       "22542   水果                                  个头不大，水分还可。就是苦的。不好\n",
       "27544  洗发水            老公喜欢用清扬的，洗过头感觉很舒服，是正品哦，，包装也很好，京东值得你去信赖！"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('./data/shopping.csv')\n",
    "df=df[['cat','review']]\n",
    "print(\"数据总量: %d .\" % len(df))\n",
    "df.sample(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "在 cat 列中总共有 0 个空值.\n",
      "在 review 列中总共有 1 个空值.\n"
     ]
    }
   ],
   "source": [
    "print(\"在 cat 列中总共有 %d 个空值.\" % df['cat'].isnull().sum())\n",
    "print(\"在 review 列中总共有 %d 个空值.\" % df['review'].isnull().sum())\n",
    "df[df.isnull().values==True]\n",
    "df = df[pd.notnull(df['review'])]\n",
    "df = df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   cat  count\n",
      "0   衣服  10000\n",
      "1   水果  10000\n",
      "2  洗发水  10000\n",
      "3   酒店  10000\n",
      "4   平板  10000\n",
      "5  计算机   3992\n",
      "6   书籍   3851\n",
      "7   手机   2323\n",
      "8   蒙牛   2033\n",
      "9  热水器    574\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Text(0.5,0,'类目')"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAgQAAAFnCAYAAADZilH/AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJzt3XmYnXV99/H3hwTCEhSQRYPsoKJQ\nFqMiggLCA1gVH9z3tdSlaqvWFRHFqo8rpa0LFi1WK1A3FAELRRQEqkGxVsAiGhRwAYREdgjf54/f\nPeYwmWRmksyce8j7dV255p7fOXPP98x1cs7n/LY7VYUkSVqzrTXsAiRJ0vAZCCRJkoFAkiQZCCRJ\nEgYCSZKEgUBa4yXJsGuQNHwGAmkNkOQRSV7aHT8zyae746cAH1+F8/5VknWTnJNk1yR/m+R+ST6Z\n5PGrqfyR3/Ww7uv2SQ6YwP1nJflOkm1WZx3SfdXsYRcgaVrcCrw3yRXAXcAdSdYHPgi8YfCOSe4C\nfryc8+wK7FJVV3TfzwaOBO4GNgCeU1UfSvLE7tzLSHIWcD/gtlE3rQcsrqqDxviZJwEfSrILUMDx\nSf6sqm5dwWM+BFi/qq5awX0kdQwE0n1ckjnAVcDrgX2Ay7ubHg2cA5yZZE5V3dG131ZV85dzroW0\nQEGSdYFraW/QGwP7Ad/tegbmVtUvRn7/wLkB7lxBucvclmQt4Gjg7dV2Uvtlkm8CxwJHDNzvscAX\ngZtpAeUhwG+SXDJ4Otrr3muq6twV1CGtcQwE0n3fqbQ37Lton8x3BX4DPA64BTgfmJPkwKq6iTat\nYMFyzvUg2pstwDrA42nBYj5wGfAH4FXAWt05tgb+mGT3qvpj93MLulpuH3XudYG1x/idbwIWVdWp\nA23vAC5K8hHgTdVcCGxLewDPBV5WVQcl+WfgdeP0JkhrvLh1sbRmSPJQ4F+AS4AlwG+BfYG/rqrL\nBu53U1VttJxzLAT2qaqrk6wNvLY7xwO6894A/Dnwjao6JslJwEeq6gdJnga8EVjEsmFgxNrAJl1N\nF3dDD/8KPK6qfjmqls2Bs4DFwCuq6mdd+7bA2cBBVfXLJFfShjlGD1FIGuCkQuk+LsljknwK+ASt\ni/07wJyqei/wfuDUJG9aiVM/GLgeeDNwDfAu4Fe0N/VHdvfZGrgSoKq+VlX70oLIlbShixu7f5d3\nbb+vqn27MLAhcDzwTuDCJIuSLE5ydZKrgYW0CZGX0wIOSR4EfBPYBvhqN1zwYFpvwiVJ/pDkySvx\nWKX7PIcMpPu+9YEvVdVZAEk2AuYCVNW5SR5NGwoYMWucIYMRmwB/A7yb9uHiG7ShgL2Bc7vfs2FV\n/WHUOfak9RDcCWzZtV1DG4JYZ+ROVfXHJDtX1Z3ACUk+CFxTVX/fPY4zgcuq6lPd97sBX6HNLXhL\nVe3etf8ceExV3Z7kX4DB+QySOgYC6b7vH4Bbkryr+35bYPMk53ffB5ib5I1VdTawZJxJhQB0n+L3\noc1B2As4GHhSVd2W5Cu0uQvnjHGa2cDXaF39+3Vt59LmNzxn8I5dGBjxBNrEyBFbAlcPfL+INvHw\n5CRvGav+kdOu4DZpjWUgkO7jqmqXkeMkmwLfAz4PXFtVR67i6ecBFwNnAg8DjuravwJ8ADhm8M7d\nJkhvBOZ0TTd2X0fe2N861i9Jchgwq6ouGmh+EK1nAYCqWkgbRoAxhkO71RYPYOmkSEkDDATSGiDJ\nLOBJtDfpN9HG2T+S5FvAu6vqgoG7rz3OkMHgzoY/B06mDRu8A/izJIfT9iZ4E22/gOdW1X91998e\neC9L9yAYGTLYeeB3/1dVLRqo/TDaXIGndd/fD9gFuH3UcsZBcwaOZ9PmNfw3rVfiv5fzM9IazUAg\n3Yd1a/g/S+tuPw942sCmQn+T5P8Axy
XZGNi7qn7H+PsQzOmO16cFi0uAw6vqt0meD7yatkHR/ya5\nGDimW2Ewl7bK4XqWdtvfPOprgNOSPL2qfp/kWOBA4KlVdXF3n1cCLwLevoKHPm/geG3aa91Dququ\nFfyMtEZz2aF0H5fk4cDCFa3DT7Jt1+XeK91Kg9t9I5emnoFAkiS5D4EkSTIQSJIk1sBJhZtuumlt\nu+22wy5DkqRpcfHFF19fVZuNd781LhBsu+22LFiwvBVVkiTdtySZ0CXAHTKQJEkGAkmSZCCQJEkY\nCCRJEgYCSZKEgUCSJGEgkCRJGAgkSRIGAkmSxBADQZItkpzXHa+d5LQkFyR52aq2SZKkyRlKIEiy\nMXAisEHX9FpgQVXtDTy5uwb6qrRJkqRJGNa1DJYAzwZO7b7fD3hrd3wBMH8V2749+MuSHAEcAbD1\n1ltPuMht3/rNCd93MhZ+4M+n5LwAHH3/KTrvoik57a4n7jol5/3Ji38yJecFuOxhO0/JeXe+/LIp\nOe8/vfKcKTkvwGs+ecCUnVvS9BpKD0FVLa6qwXeYDYBruuPFwBar2Db69x1fVfOrav5mm417wSdJ\nktY4fZlUeDOwXnc8l1bXqrRJkqRJ6Mub58XAPt3xbsDCVWyTJEmTMKw5BKOdCJyeZF/g4cB/0YYB\nVrZNkiRNwlB7CKpqv+7rVcBBwPeAA6tqyaq0DeGhSJI0o/Wlh4CquhY4ZXW1SZKkievLHAJJkjRE\nBgJJkmQgkCRJBgJJkoSBQJIkYSCQJEkYCCRJEgYCSZKEgUCSJGEgkCRJGAgkSRIGAkmShIFAkiRh\nIJAkSRgIJEkSBgJJkoSBQJIkYSCQJEkYCCRJEgYCSZKEgUCSJGEgkCRJGAgkSRIGAkmShIFAkiRh\nIJAkSRgIJEkSBgJJkoSBQJIkYSCQJEkYCCRJEgYCSZKEgUCSJGEgkCRJGAgkSRIGAkmShIFAkiRh\nIJAkSRgIJEkSBgJJkoSBQJIkYSCQJEn0JBAk2TjJ6UnOS/LJru2EJBckOXLgfhNqkyRJk9OLQAC8\nEPh8Ve0LbJjkzcCsqtobmJdkpySHT6RteA9BkqSZa/awC+jcADw0yUbAVsAi4JTutnOAfYA9Jth2\nxTTVLEnSfUZfegjOB3YCXgdcDswBruluWwxsAWwwwbZlJDkiyYIkC6677ropeQCSJM1kfQkE7wNe\nWVXvoQWC5wHrdbfNpdV58wTbllFVx1fV/Kqav9lmm03NI5AkaQbrSyBYH9g1ySzgMcAHaN3/ALsB\nC4GLJ9gmSZImqS9zCN4PfBbYBrgQ+BhwXpJ5wKHAXkBNsE2SJE1SL3oIqur7VfWIqppbVQdV1WJg\nP+AiYP+qWjTRtuE8AkmSZra+9BAso6puZOkKgkm1SZKkyelFD4EkSRouA4EkSTIQSJIkA4EkScJA\nIEmSMBBIkiQMBJIkCQOBJEnCQCBJkjAQSJIkDASSJAkDgSRJwkAgSZIwEEiSJAwEkiQJA4EkScJA\nIEmSMBBIkiQMBJIkCQOBJEnCQCBJkjAQSJIkDASSJAkDgSRJwkAgSZIwEEiSJAwEkiQJA4EkScJA\nIEmSMBBIkiQMBJIkCQOBJEnCQCBJkjAQSJIkDASSJAkDgSRJwkAgSZIwEEiSJAwEkiQJA4EkScJA\nIEmSMBBIkiQMBJIkiZ4FgiQfT/KU7viEJBckOXLg9gm1SZKkyelNIEiyL/DAqvpGksOBWVW1NzAv\nyU4TbRviQ5AkacbqRSBIsjbwaWBhksOA/YBTupvPAfaZRJskSZqkXgQC4EXApcAHgUcDrwGu6W5b\nDGwBbDDBtmUkOSLJgiQLrrvuuil5AJIkzWR9CQR7AMdX1W+BzwPfBdbrbptLq/PmCbYto6qOr6r5\nVTV/s802m5pHIEnSDNaXQPBzYPvueD6wLUu7/3cDFgIXT7BNkiRN0uxhF9A5AfhMkucAa9PmBnw9\nyT
zgUGAvoIDzJtAmSZImqRc9BFX1x6p6ZlU9vqoeW1VX0ULBRcD+VbWoqhZPpG04j0CSpJltpQJB\nkrckedY495mT5KSVKwuq6saqOqWbVzCpNkmSNDkr20Mwp/u3jCRbJVkLWAI8eWULkyRJ02fcOQRJ\nPkob17+r+3ojcBNwZ5KzgNtpY/lrA+8HzgC2qqo/JLl1qgqXJEmrz0R6CJ5Nm/X/yu7rM2gh4G7g\ncbRwcBDwAGAz4E5gZCz/7tVarSRJmhITCQSLquopA18zcNtNVfVC2n4An+/a7qqqJau5TkmSNIUm\nM4egpqwKSZI0VL1YdihJkoZrVTcm2ijJ52jbBr8AuBWY27WBgUOSpBlhdexUODJfoIB7RrVJkqQZ\nYFUDwU1V9dIkTwa+QLvy4C5V9VKAJNeuaoGSJGnq2aUvSZIMBJIkaWKB4P5JTgc26b5C24sgLDup\nEGCtJH25iqIkSZqAibxxn9zd71La9sQLgPW644toOxSe3bVd132dB/wKmLX6S5YkSavbuIGgqt4w\nui3Ju4C1q+qAMW47BLgxybq0ngNJktRzK9u1vz6th2AZVXUeQBcIvraS55ckSdNoZQPBuxlnr4Gq\nuh14/kqeX5IkTaMJrzJIsk6Sf0uyZVXdWlV3jHP/lyZ5zKqXKEmSptpkegjuAp4DXJnk58CvgSur\n6qrRd0wyD/gQ8H3gSaujUEmSNHUmHAiqqpJACwWbABsDleRG2mqD/wA+R1tZcBYtQLx8dRcsSZJW\nv5XZmOiJVfUAYF1gR+AVtCWJfwNcDVwMbNDd7zerq1BJkjR1VhgIkuyY5MgkDxl9W1XdWVW/rKqv\nAV8GLqStPtgS+HZVXTolFUuSpNVuvB6C/YF3AZcluZJ2RcP9khyY5PXdJMOrgfNovQIHAXsCT03y\ntqksXJIkrT4rnENQVZ9OciKwC/AE4ADgU8A63V3OA94MfKuqbhj5uSQvB05JckZVXTIllUuSpNVm\nInMI5lXVD4EfAj8CtqBNFvwB8Hjg4aPCQIArgNOA41Z7xZIkabUbbw7BNsAVSU4C9gMOrqrFwO+B\nxcDTgB8kuTbJR5M8CHgm7doG7wFOH/vMkiSpT1YYCLo9BnYD7qANL9yV5HnAV4DvVtXXgcuBjYB9\ngZ8DHwY+U1WXVNUHprJ4SZK0eozXQ7ALsBXwWeAeYEPalsUfAn6Y5AHdXW+uqkfRhhIeDGwzZRVL\nkqTVbryNif4SeM3A9wX8G5Du+DjufYnja2mXPd4vyauq6hOrsVZJkjRFxptU+GbacMAWwAu6tuuB\nQ4D7AX8AXgJslOT/0eYUfBN4MfDeJBtOQc2SJGk1G28OwW20pYb/DTyQtrLgONpljR9XVccAe9O2\nKX4c8HrgnKr6T+CnwKumrnRJkrS6jDeHYFPgBOCtwA3A7Kr6O+CDwJe7OQazgLuqah/gMNoKA2jX\nNXjFVBUuSZJWn/E2Jro+ycOq6rok+wBf7dqPTrIXsCltCeIVXftpAz9+OnDb1JQtSZJWp3GvdlhV\n13VfzwfOH7jpsKq6ozt+1Bg/dy3whdVRpCRJmlorc7VDAAbCgCRJmuFWOhBIkqT7DgOBJEkyEEiS\nJAOBJEnCQCBJkjAQSJIkDASSJAkDgSRJwkAgSZLoWSBIskWSH3XHJyS5IMmRA7dPqE2SJE1OrwIB\n8GFgvSSHA7Oqam9gXpKdJto2xNolSZqxehMIkhwA3AL8FtgPOKW76Rxgn0m0jXXuI5IsSLLguuuu\nm4LqJUma2ca92uF0SLIOcBTwNOBrwAbANd3Ni4EdJ9G2jKo6HjgeYP78+bX6H4Gk1ekjz37ylJz3\njSefNv6dpDVUX3oI3gr8U1Xd1H1/M7BedzyXVudE2yRJ0iT15Q30QOA1Sc4FdgeewtLu/92AhcDF\nE2yTJEmT1Ishg6p6/MhxFwqeCpyXZB5wKLAXUBNsk6Rpd/Vbz5uS
8z74A/tOyXml0frSQ/AnVbVf\nVS2mTRi8CNi/qhZNtG04VUuSNLP1oodgLFV1I0tXEEyqTZIkTU7veggkSdL0MxBIkiQDgSRJMhBI\nkiQMBJIkCQOBJEnCQCBJkjAQSJIkDASSJAkDgSRJwkAgSZIwEEiSJAwEkiQJA4EkScJAIEmSMBBI\nkiQMBJIkCQOBJEnCQCBJkjAQSJIkDASSJAkDgSRJwkAgSZIwEEiSJAwEkiQJA4EkScJAIEmSMBBI\nkiQMBJIkCQOBJEnCQCBJkjAQSJIkDASSJAkDgSRJwkAgSZIwEEiSJAwEkiQJmD3sAiRJ0+/oo4+e\nkefW1LGHQJIkGQgkSZKBQJIkYSCQJEkYCCRJEj0JBEnun+SMJGcl+WqSdZKckOSCJEcO3G9CbZIk\naXL6suzw+cBHq+qsJJ8AngPMqqq9k3w8yU7ArhNpq6orhvg4JElT5D/P2WFKzvvEA66ckvPONL0I\nBFX18YFvNwNeABzbfX8OsA+wB3DKBNqWCQRJjgCOANh6661Xc/WSJM18vRgyGJHkscDGwK+Ba7rm\nxcAWwAYTbFtGVR1fVfOrav5mm202RdVLkjRz9SYQJNkE+AfgZcDNwHrdTXNpdU60TZIkTVIv3kCT\nrEPr+n9bVV0FXEzr/gfYDVg4iTZJkjRJvZhDALwceCTwjiTvAD4LvDDJPOBQYC+ggPMm0CZJkiap\nFz0EVfWJqtq4qvbr/p0I7AdcBOxfVYuqavFE2obzCCRJmtn60kOwjKq6kaUrCCbVJkmSJqcXPQSS\nJGm4DASSJMlAIEmSDASSJAkDgSRJwkAgSZIwEEiSJAwEkiQJA4EkScJAIEmSMBBIkiQMBJIkCQOB\nJEnCQCBJkjAQSJIkDASSJAkDgSRJwkAgSZIwEEiSJAwEkiQJA4EkScJAIEmSMBBIkiQMBJIkCQOB\nJEnCQCBJkjAQSJIkDASSJAkDgSRJwkAgSZKA2cMuQJKk+6oHfvuSKTnvb/fffbWf0x4CSZJkIJAk\nSQYCSZKEgUCSJGEgkCRJGAgkSRIGAkmShIFAkiRhIJAkSRgIJEkSBgJJksR9JBAkOSHJBUmOHHYt\nkiTNRDM+ECQ5HJhVVXsD85LsNOyaJEmaaWZ8IAD2A07pjs8B9hleKZIkzUypqmHXsEqSnAAcV1U/\nTvJ/gD2r6gOj7nMEcET37UOBn01BKZsC10/BeafSTKt5ptUL1jwdZlq9YM3TYabVC1NX8zZVtdl4\nd5o9Bb94ut0MrNcdz2WMXo+qOh44fiqLSLKgquZP5e9Y3WZazTOtXrDm6TDT6gVrng4zrV4Yfs33\nhSGDi1k6TLAbsHB4pUiSNDPdF3oIvgacl2QecCiw15DrkSRpxpnxPQRVtZg2sfAiYP+qWjSkUqZ0\nSGKKzLSaZ1q9YM3TYabVC9Y8HWZavTDkmmf8pEJJkrTqZnwPgSRJWnUGAkmSZCCQJEkGgimRZP9h\n1zCeNH/WHe+UZINh1zQRSXZM8qBh1zGeJDNiBc94dSY5eLpqWVVJnpXkL4Zdx2hJDh043iHJs4ZZ\nz6rq6995UJK1kjx32HVMVJKHJNkyyfZDrcNJhZOX5CTablL/AewAPJO2++EpwIOBB4zeLXHYkhwE\n3Nl9uw5wGfDFqto3yZdouz1+d2gFjiHJDsDzquqYgbbPA+dU1WeGV9nYkqwNfLmqnprk/KraJ8lf\n0oJ3AetU1XHDrfLeknyuql7U7fi5HvB94GtVtTDJq4Edq+oNw61yYpJsTfv7P2rYtQxK8h3gycAD\ngC8ApwN7ApvTnhdbV9W2Qytwknr8d54NvKOq3p1kLeDsqjpg2HUtT5IPA0uAzwHHAscArwKuAu4B\n1gbeVlV3T1dNM+JTTA9tA7wHOAk4EfgX4Czgy7RA8NChVbZ8nwQ+A7yMVutZwG1Jdgc27FsY6CwE\nNk/yiqr65yRvBu7uYxgAqKq7
Bj5x39V9fTXwUeBNwIeGUtiK3dJ93Rp4NvAY4F1JHtjd9sxhFTaW\nJG8FnkfboTSjbwZ+O+1Fje8e4LPAAmBL4OG0N4Jjgdtpz41emaF/5yXA/sC7q+qeJPcMu6BxPA74\nO+AQYBbtvWMOLTB+AHjOdIYBMBCsrNuq6tIkzwCeArwWeAJwNHA18M/A04dX3piuqqq/S3IgLZE+\nCNiR9ib1wqFWthxVtQR4bZKTu27WC6rqJUMua6JGut5uqKoTk7ygqj431IrGdnWS+3fHOwOPBjYD\n/hM4HJgHXDOk2sYyC3hJVf1w2IVMRNfL8nBgO+CRtID1G1oAG3mO9LGbdkb9nQGqqkaFgL4HgtuB\na2k77S6h7adzLu3/2x1V9avpLshAsHIenuTrtKS8Fa0r/n7AX3Rtc4dY2/Js2V3kaR5wGHB/WpfU\nV4HDkszpU3d2V+tsYA9a6v9n4BdJXkR7Ab2gqq4cYon3kmRT4B+ABye5BLh/kgtpL6x9dj1wMLAx\n8DrgDODvq+qmJN8ETk6yb/VrbPFjSf4A3A3cBtwA/A/wjar6/VArG6WqPp6kgDOBv6d9ApxLe514\nFe2N4MHDq3CFZszfOcl5tDp3S3IO7e/7ZwPHs6tq32HWOIabgLfTegT2Ay4HfkLrff5Vki9W1bTO\ngzAQrJy9af+RBxXtibeElvr65i7af+i7gEW0ce31afXeQAsHfbI+LUEv6P7B0otYzaEN1fTpUteL\naLuM/W1VPWmksXtBgn5+CgS4gNbLdRLtObEd8E9JNgEuoY3J9qn2DwPHATvR5sHMBXaldcV/sZu7\n8a4h1jeW5wIvoPUe/oH2f+1O2nVYdqmqC4dY2/LMtL/zwbQegW/RuuDXAr4J/DktlPcxmP8H7fV3\nc9r/uxtovUffoYWbOdNdkKsMVs45wBsHvr4J+F53fCTwleGVtly/r6ovA9fRuqUuAn5HGzf+flV9\ncYi1LaOqjq2qT9L+Qx9Ie8P9dFV9quvJ+MhQCxylqu6qqm+PcdNOSc4G9kxyTt9WH1TVT4Czac+L\nU1k6Bnso8EPasEFvVNUdtHHtD1XVbcAJwFZV9a+050mvuri7yW1LaGFxC+BttKG6vwD+ip6+Bs+0\nvzPtTXV9YElV3VlVt3fHt1fVLd0W933zCtoHtMtowfxxwBNpPQevAX483QX16sVpBvl1Vb0uyXZV\n9TqAJLsMHD9juOWNaecknwEeRuum+iztE8traJMi+zobdyPafIzn0S5idWRVfbuqvjrkupZn8ySH\n0HoE7gaeBfy8qq5LMqubF9EbXVhZm3Yd9pGeow8ngfb6sP7wqltWkv8BFtO6hv+XrvZuNcdIvacO\nscTR/hbYhfbJ+vjueBPaPI2DaUHxbcBzq+qPQ6tylBn4d96RFrC2S7JXVV1EP3sFBt1Oew0e+T/2\nDVpv0shk059Pd0EuO1wJSU5j2QlBD6Il6v8BTqyqBWP97LAkeQRLhzlm0z4RnlJVT0jyUVovwUlD\nK3A5knyPdtGqO5NsDnwJOL6qPj/k0saU5D20/8z30LpZN6etStkBOLmq3jHE8saU5NG0QDjSY/Q9\n4MPdJ8Pe6T51nwe8A3gO8BDgY1X1jaEWthxJXkILtM8HPk/rWdwDOI22CuUvquqnQytwOWba3xna\nnirAx4AjaMuqnzDkkpYrySdpHxo+TZvc/WpaGFhIe1+ZBby+qu5a3jlWe00GgpWTZKequmJU21q0\nce0t+9YFP1rXdT2fNqnsDuCmPn1CGZHk8cD3Rj5ZJ9kY2KaqLhluZfeWZBZweFX9ezfBcKSruLrj\ntYCHVtV5QyxzTN18gft1ew+sRQsFP+vrDPOuxkOr6pvd91sAz6iqfxpuZcuXZDfgUmCHqro8yWFV\ndWqSRwJzquqCIZe4jMG/c/d68Rba60Tv/s5JdqZ1tW9Cq/GaJE+tqq93rxnrVNXvhlvl8nXP4Z
uB\nh1TVj4ZVRy/Hr2aIE7rdsF490Bbgg8AVy/mZoep2SXsnQFXd3XWrHQ0c1Mcw0Ll0sJu9qm6sqku6\nN7E+KdqYIMC/09YR/5i2xPNU2gS9lw6ntLF1z9/XsXSlDHSPo8dhYHtg7ZEwANC90PdtUuyfJNkQ\neB/tTenyJOcCRya5AHh+T8PA3rQJhd/qhunuBvbtYxjo7ElbrXEWcEiS79PmSkH7f/mCIdU1IVX1\nu26uw48yxN1YDQQr75aqugd4YJL3dW0fB77et+GCAQvpNvoB6PtGP0nWow0RjPQMDDp7+itavu65\nsE2SN9CWdv6U9onl/bRQcHlVvWyIJS6jq/mptI20zu42o5lNv9dvP5f2HF43yTpJ5iRZFzho2IWt\nwBdoM/ZPSfJxWi/Bo4B9gYOSzBtqdWP7c2B3Wi/XyGqePncnF+15+zPauPzdwFe6D2wjwwi9M/gh\nbcDRwKFj3H3KGQhWXgFU1VHAnG4Szi+r6n0r/rHhqaolVfVa2ovQfwDr93Wjn7S90j/B0k+upyU5\nNsncJFsBvx5edct1E22J5B+BH9D2e+jziyjAPVV1Cu3Ff31gN/pf83HAt2ld8L8Bvgisk2S7JH3c\nJfQvq+pbwAa0iYWL4U8bbz2dfu7692NgXrfkdCQg9vl5cQRte+iibQB1Oy0E7Ebba2Wr4ZW2Qgvp\n0Yc0VxlMUtp+9c8FdkiyPu0T4PbAi4AlSQ6pqjOHWeNYZtJGP2lT3NelbbM88re8FfgUbXXEXbSw\n0Dfb0WaVb01bfnoe0Mc15oO2S/L27vgO2sz3bbu20Ma3jxpadQOSPAp4CW0Tl8/TJmMdAvyCNu/h\n/9E+5PRqlU9V/aY7vLUb7lo0cNv/Dqms8VwFvCnJdrQVSscNfC3gzKo6Y6gV3tuZwI9oz40ltI3i\nHgb8X1pofF+3+uCOoVU4hurZbqwGgsm7h5Y676btTPfvIwGgGyv8TJJLawjbTo5jxmz0U1WV5Kfc\nuwerquqybtz1DbQZ273RhZgfVdVTuqV8f0lbcrgL7W+/bpKte/i8uJXWzQpLN9e6rWsLQ9gcZQUW\n0AL4p2i1jvy7FPhhVfXyKoJJLqct6dype/7u0n2FdqGxDapq56EVOLbrac+DL9OubzHy9Uu01TMf\npe1q2RfX0GoesTMtJL6PtpHce+nZMsQ+fkgzEExSl+jemOThtK1I39MtNRtJ/RsC/0gbm+2NqjoW\nIG3L5Tto/8FP6caRSdKbLvhuTPhZwLuARyT5K2Buko/RZuJ+hNZL82/Dq3IZg5PazqL1FBTt08os\nWhfsO2kb0vTJ76rqy91yrQ/QNtka2cSqV7qguCnt7/hRll505/7dv16qqocBJDm9qp6U5MKq2nvk\n9m6uTN8spvVofCfJDd3XP1Tt7J1RAAAHqElEQVR3EbRuVU2fjGwUN7Kq52La5O7dgfNpT59bh1fe\nmHr3Ic1AsPKqqv4beFrXvbpFVb0eIMkGwy1thXq/0U+3y9grAZL8Jy35r0/bKe0Z3VDNSfQrEADc\n0U1+PI62bvsk2vrif6yqy4Za2fKtleQptDBwZFX9Mm3v/b5amzZjfKR3ANp1GLZK8l3gmX1cXtb1\nII0EmHv9fXu638NttBA+i9aLAQN193Avgg/RrgWwO20L4LVoc3ouoe0R80J6toNsHz+kOalw5d1v\n5GBkImHaBiRU1S3L+Zk+mAUsqqq/p42vvTtJ75bkJPlI2vXCNwdOpi0r+kmSf6el6krSp+7sJwKX\nVdWNtOuab0cbVroA+EKSbySZP8wCR+vWmX+Dtjzr0VX11e6Nq2+f/oA/ba51IO1FfgPasMwetFnk\n/wMcRVvp00ezWPqGtE73t++zu2irYwCO6+rt7dJOWtiaBTyCNv/oBtr27FfStgReK8n9lv/jQ7UR\nbVOiLWgf0vYHGMaHNDcmWklJ9hxcq911++3U9Rr0VmbIRj
8ASfagdWEfMRKykryWtuvfG0aSdN+k\n7fx32cjeDt2b7EuB06uqjzPK/6Sr9cCqOmvYtYzWvaAfSxs2egttPk9oH2y+1HXHH1tVfz3EMseV\nZO8+7j2wIl0gOKSqTh92LWNJ25/ij7Qh25tr4GqMaZsqbdiF9d5Jj3ZjNRCo97puy0Or6rTu+6OA\nBX19cdLUSnJoVZ2R5OVVdULX9spqF8OSZpTuQ9oFtL00bqDNfdgR+NV0D3/1vdtKa7AkI7vRFW2d\n8Yiv0iZ0ag00sNztmQNthgHNSFX13W4nyBuAU4C9aHsoPGq6a3FSofpsCbQd9ZJskWTkUrxr0cY4\ntWbr5ZCRNFFJLqTtk3AnbeL0S2lDHwcPYy6agUC9lOQHwPbdeu2RCUNzWTpTu2/L9zQNuvHWO2jP\ng12TnNMdj+yhsG5VPXaIJUoTkuRV3eHHaVca/ShtwmwBr0zym6qa1pVUBgL11aOBU6vqqfCnzV0e\nQNsw57tVdfMwi9NwVNXjRo6TnFFVQ9nzXVoNRiY5zgP+lfbati0t8F5Ldx2X6eQcAvXVUcAeSQ7u\nZjhfR/sPMx+4OMnTh1qd+uBPyyOTbDPMQqTJqqqTaLsTfg14K20zto/QNiU6H9h1umsyEKivjgGe\nRpt5Owe4sapOr6r30CbbvDntGuJawyTZsTsc3BzngyMXiJFmkOfQhrr27JZKHk7bUOlDtP0UppXL\nDtVLSXYBXl1Vr+4uarOENodgD9oOhfdU1Q3DrFHTL8k+tE9RT6Y9D2bTxlxn0TageXFVfWF4FUoT\nk+RU2oZr5wOPBb5He417IXDLyF4x08keAvXVYbQrlG1IG19bi6UX3DmDtvvf3CHWp+F4DG1Piuto\noXD/qjqgqp5A2/GtN1twSytSVYcBl1fVc2nXP3lz9/VQ4Pwk0349HHsI1GtJNgIeO/pSq6N3itSa\nJ8nruy24pRkpya5V9ZNuW+6rRiZLd7ty7llV505rPQYCSZLkkIEkSTIQSJIkA4GkSUryoFHfH5jk\n7aPaXtzN/5A0QxgIJE1YkvsDlyQ5dqD5McBrBu5zAPBZ4J3LOcfdSWoS/x4+pQ9KEuDWxZImoaoW\nJXk1cFKSdWhBYAndxaaSPAQ4GfgybRnVWG4DPgX84zi/7hm0DVruXg2lSxqHgUDSuJLMpu0YeWdV\nfTnJX9M2Asqouz4K+F/a5iqVZF2Aqrp94D5LgJuqauE4v/P67tBAIE0DA4GkiTiQtiEUyb0ywHNH\nDpIMrmG+beD4ROAlA99P9rLFBgJpGhgIJE3ERbRtoxcBN9O2C94BOJN2Lfc7aNuwfph2lbaLaHOU\n1gNuH3Wue4CNkmw7zu/cdOD+kqaYGxNJmrQku9Ou0vYL4NvAy2lXZzsbeCRt7P9dVXXnGD97Pe1S\n1hO1VVVdvcpFS1ohVxlImrAkc5K8GbgQuJR2kaEC1qmqP9LmFbwf+Fvgwm6S4VjeWVVZ0T/gpdPw\nkCR1DASSxpVk/STH0CYMvpu2pPDJVXUr7TKt6wBU1d1V9U7gKcBOwMVJthpS2ZImwTkEksZVVbd2\n+wGcCxxVVVcN3PxfwMFJtq+qX3T3PyPJ3sCBVfXrMU45mTkEkqaBcwgkjSvJPGAL2uTB0ZP8ng0c\nBczn3qsLQus5mF1VFw+cyzkEUg/ZQyBpIl4BvIW2omDJqNs2og0/fpcWGEasRdu7oIC5o37mnVX1\n3hX9wiQvoe14KGkaGAgkjauq3gO8Z3R7ku2By4GrgMXAviPXdJc0szipUNJK6S5edDKwgLYHAcBp\nSbYcXlWSVpaBQNKkJJmd5Om0ILA+8PSqWkTbzfBu4KdJ3phk4+WcYhZwzHgXNWLpcMHo7ZElTQEn\nFUoaV5KtgScAewFPBzYBPgG8vapuGbjfLNpFjd5GG5L8JnBmVZ0wcJ+bgU8y8Ysb7TCyekHS1HEO\ngaSJ2J52TYKf0d7MP1
1V14y+U1UtAd6f5DPA64AX03oSBt0O/HYSFzdaZ9VKlzQR9hBImpAkm1fV\n7yf5M2sBs6rqrikqS9JqYiCQJElOKpQkSQYCSZKEgUCSJGEgkCRJGAgkSRLw/wHqTZLz8VfITgAA\nAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7ff270b9d668>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "d = {'cat':df['cat'].value_counts().index, 'count': df['cat'].value_counts()}\n",
    "df_cat = pd.DataFrame(data=d).reset_index(drop=True)\n",
    "print(df_cat)\n",
    "\n",
    "df_cat.plot(x='cat', y='count', kind='bar', legend=False,  figsize=(8, 5))\n",
    "plt.title(\"类目数量分布\")\n",
    "plt.ylabel('数量', fontsize=18)\n",
    "plt.xlabel('类目', fontsize=18)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 数据清洗"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "#定义删除除字母,数字，汉字以外的所有符号的函数\n",
    "def remove_punctuation(line):\n",
    "    line = str(line)\n",
    "    if line.strip()=='':\n",
    "        return ''\n",
    "    rule = re.compile(u\"[^a-zA-Z0-9\\u4E00-\\u9FA5]\")\n",
    "    line = rule.sub('',line)\n",
    "    return line\n",
    "\n",
    "#停用词列表\n",
    "def stopwordslist(filepath):  \n",
    "    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]  \n",
    "    return stopwords  \n",
    "\n",
    "#加载停用词\n",
    "stopwords = stopwordslist(\"./data/chineseStopWords.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /tmp/jieba.cache\n",
      "Loading model cost 0.898 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cat</th>\n",
       "      <th>review</th>\n",
       "      <th>clean_review</th>\n",
       "      <th>cut_review</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>书籍</td>\n",
       "      <td>﻿做父母一定要有刘墉这样的心态，不断地学习，不断地进步，不断地给自己补充新鲜血液，让自己保持...</td>\n",
       "      <td>做父母一定要有刘墉这样的心态不断地学习不断地进步不断地给自己补充新鲜血液让自己保持一颗年轻的...</td>\n",
       "      <td>[做, 父母, 一定, 刘墉, 心态, 不断, 学习, 不断, 进步, 不断, 补充, 新鲜...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>书籍</td>\n",
       "      <td>作者真有英国人严谨的风格，提出观点、进行论述论证，尽管本人对物理学了解不深，但是仍然能感受到...</td>\n",
       "      <td>作者真有英国人严谨的风格提出观点进行论述论证尽管本人对物理学了解不深但是仍然能感受到真理的火...</td>\n",
       "      <td>[作者, 真有, 英国人, 严谨, 风格, 提出, 观点, 进行, 论述, 论证, 物理学,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>书籍</td>\n",
       "      <td>作者长篇大论借用详细报告数据处理工作和计算结果支持其新观点。为什么荷兰曾经县有欧洲最高的生产...</td>\n",
       "      <td>作者长篇大论借用详细报告数据处理工作和计算结果支持其新观点为什么荷兰曾经县有欧洲最高的生产率...</td>\n",
       "      <td>[作者, 长篇大论, 借用, 详细, 报告, 数据处理, 工作, 计算结果, 支持, 其新,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>书籍</td>\n",
       "      <td>作者在战几时之前用了＂拥抱＂令人叫绝．日本如果没有战败，就有会有美军的占领，没胡官僚主义的延...</td>\n",
       "      <td>作者在战几时之前用了拥抱令人叫绝日本如果没有战败就有会有美军的占领没胡官僚主义的延续没有战后...</td>\n",
       "      <td>[作者, 战, 之前, 拥抱, 令人, 叫绝, 日本, 没有, 战败, 会, 美军, 占领,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>书籍</td>\n",
       "      <td>作者在少年时即喜阅读，能看出他精读了无数经典，因而他有一个庞大的内心世界。他的作品最难能可贵...</td>\n",
       "      <td>作者在少年时即喜阅读能看出他精读了无数经典因而他有一个庞大的内心世界他的作品最难能可贵的有两...</td>\n",
       "      <td>[作者, 少年, 时即, 喜, 阅读, 看出, 精读, 无数, 经典, 一个, 庞大, 内心...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  cat                                             review  \\\n",
       "0  书籍  ﻿做父母一定要有刘墉这样的心态，不断地学习，不断地进步，不断地给自己补充新鲜血液，让自己保持...   \n",
       "1  书籍  作者真有英国人严谨的风格，提出观点、进行论述论证，尽管本人对物理学了解不深，但是仍然能感受到...   \n",
       "2  书籍  作者长篇大论借用详细报告数据处理工作和计算结果支持其新观点。为什么荷兰曾经县有欧洲最高的生产...   \n",
       "3  书籍  作者在战几时之前用了＂拥抱＂令人叫绝．日本如果没有战败，就有会有美军的占领，没胡官僚主义的延...   \n",
       "4  书籍  作者在少年时即喜阅读，能看出他精读了无数经典，因而他有一个庞大的内心世界。他的作品最难能可贵...   \n",
       "\n",
       "                                        clean_review  \\\n",
       "0  做父母一定要有刘墉这样的心态不断地学习不断地进步不断地给自己补充新鲜血液让自己保持一颗年轻的...   \n",
       "1  作者真有英国人严谨的风格提出观点进行论述论证尽管本人对物理学了解不深但是仍然能感受到真理的火...   \n",
       "2  作者长篇大论借用详细报告数据处理工作和计算结果支持其新观点为什么荷兰曾经县有欧洲最高的生产率...   \n",
       "3  作者在战几时之前用了拥抱令人叫绝日本如果没有战败就有会有美军的占领没胡官僚主义的延续没有战后...   \n",
       "4  作者在少年时即喜阅读能看出他精读了无数经典因而他有一个庞大的内心世界他的作品最难能可贵的有两...   \n",
       "\n",
       "                                          cut_review  \n",
       "0  [做, 父母, 一定, 刘墉, 心态, 不断, 学习, 不断, 进步, 不断, 补充, 新鲜...  \n",
       "1  [作者, 真有, 英国人, 严谨, 风格, 提出, 观点, 进行, 论述, 论证, 物理学,...  \n",
       "2  [作者, 长篇大论, 借用, 详细, 报告, 数据处理, 工作, 计算结果, 支持, 其新,...  \n",
       "3  [作者, 战, 之前, 拥抱, 令人, 叫绝, 日本, 没有, 战败, 会, 美军, 占领,...  \n",
       "4  [作者, 少年, 时即, 喜, 阅读, 看出, 精读, 无数, 经典, 一个, 庞大, 内心...  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#删除除字母,数字，汉字以外的所有符号\n",
    "df['clean_review'] = df['review'].apply(remove_punctuation)\n",
    "\n",
    "#分词，并过滤停用词\n",
    "# df['cut_review'] = df['clean_review'].apply(lambda x: \" \".join([w for w in list(jb.cut(x)) if w not in stopwords]))\n",
    "df['cut_review'] = df['clean_review'].apply(lambda x: [w for w in list(jb.cut(x)) if w not in stopwords])\n",
    "df.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['作者', '力', '马克思', '注意', '经济学', '角度', '剖析', '当代', '中国', '经济', '细心', '人会', '发现', '中国', '近', '20', '年', '政策措施', '何新', '先生', '文章', '里', '找到', '蛛丝马迹', '作者', '早', '2001', '年', '预言', '十年', '世界', '会', '爆发', '大规模', '金融危机', '非常', '难得', '原因', '作者', '数据', '统计', '严密', '逻辑推理', '国家', '发展', '大计', '重任', '知识分子', '现在', '不多见', '看看', '上市公司', '独立', '懂事', '知道', '认为', '何先生', '了不起', '知识分子']\n",
      "cat: 书籍\n",
      "['喜欢', '张爱玲', '拥有', '两套', '张爱玲', '全集', '新出', '书', '买', '当当', '实惠', '爱到', '好像', '本书', '当做', '广告', '熟悉', '张爱玲', '喜欢', '文字', '劝', '不要', '买', '本书', '沿袭', '张', '风范', '有点', '晦涩', '难懂', '需要', '用心', '仔细阅读', '喜欢', '张', '一定', '读', '下去', '值得', '表扬', '出版商', '封面设计', '真不错', '个人', '非常', '喜欢', '装帧', '考究', '觉得', '超值']\n",
      "cat: 书籍\n"
     ]
    }
   ],
   "source": [
    "def print_cut_review(index):\n",
    "    example = df[df.index == index][['cut_review', 'cat']].values[0]\n",
    "    if len(example) > 0:\n",
    "        print(example[0])\n",
    "        print('cat:', example[1])\n",
    "        \n",
    "print_cut_review(10)\n",
    "print_cut_review(500)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 创建训练集和测试集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "TaggedDocument(words=['平心而论', '价位', '房间', '应该', '算', '不错', '没有', '碰到', '下面', '几件', '事情', '拿到', '房卡', '房间', '居然', '没有', '打扫', '房门', '锁', '发生', '两次', '故障', '锁', '外面', '不用', '钥匙', '打开', '服务员', '很快', '修好', '总是', '怕怕', '宾馆', '主楼', '一道', '边门', '直接', '通往', '公园', '通道', '居然', '门大开', '没有', '门卫', '安保', '措施', '这道', '门', '直接', '二楼', '客房', '走廊', '走廊', '探头', '安全措施', '总让', '放心', '个人感觉', '餐厅', '菜', '太贵', '一盘', '糖醋排骨', '68', '元', '没敢', '点', '要求', '不高', '早上', '自助餐'], tags=['酒店'])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#创建训练集和测试集\n",
    "train, test = train_test_split(df, test_size=0.3, random_state=42,stratify = df.cat.values)\n",
    "\n",
    "#创建标签化文档\n",
    "train_tagged = train.apply(\n",
    "    lambda r: TaggedDocument(words=r['cut_review'], tags=[r['cat']]), axis=1)\n",
    "test_tagged = test.apply(\n",
    "    lambda r: TaggedDocument(words=r['cut_review'], tags=[r['cat']]), axis=1)\n",
    "\n",
    "train_tagged.values[30]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "#CPU内核数\n",
    "cores = multiprocessing.cpu_count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 训练DBOW模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 43941/43941 [00:00<00:00, 2494442.88it/s]\n"
     ]
    }
   ],
   "source": [
    "from gensim.models import Doc2Vec\n",
    "from tqdm import tqdm\n",
    "\n",
    "model_dbow = Doc2Vec(dm=0,  negative=5, hs=0, min_count=2, sample = 0, workers=cores)\n",
    "model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 43941/43941 [00:00<00:00, 1764448.24it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2868512.25it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2949349.68it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2926727.94it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3037676.56it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2978568.62it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2916031.08it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2966057.45it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3016941.05it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2992594.29it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2998778.24it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3024466.45it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2929612.34it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3025707.78it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2901204.42it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2952231.56it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3027248.44it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2873834.99it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2884810.87it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3009207.33it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3058902.13it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3006899.84it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3034125.94it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2959057.09it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2911424.61it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2899834.98it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2900656.49it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2886753.84it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3018373.93it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3045456.85it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 21s, sys: 20.4 s, total: 1min 41s\n",
      "Wall time: 1min 15s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "for epoch in range(30):\n",
    "    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)\n",
    "    model_dbow.alpha -= 0.002\n",
    "    model_dbow.min_alpha = model_dbow.alpha"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 生成文档特征向量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "def vec_for_learning(model, tagged_docs):\n",
    "    sents = tagged_docs.values\n",
    "    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])\n",
    "    return targets, regressors\n",
    "\n",
    "# Build labels and inferred vectors from model_dbow for the train and test sets\n",
    "y_train, X_train = vec_for_learning(model=model_dbow, tagged_docs=train_tagged)\n",
    "y_test, X_test = vec_for_learning(model=model_dbow, tagged_docs=test_tagged)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 预测和评估"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit a logistic-regression classifier on the training vectors, then predict the test set\n",
    "logreg = LogisticRegression(C=1e5, n_jobs=1)\n",
    "y_pred = logreg.fit(X_train, y_train).predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Testing accuracy 0.7770284621920136\n",
      "Testing F1 score: 0.7772072575572019\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score, f1_score\n",
    "\n",
    "# Report accuracy and the weighted-average F1 of the test-set predictions\n",
    "print('Testing accuracy %s' % accuracy_score(y_true=y_test, y_pred=y_pred))\n",
    "print('Testing F1 score: {}'.format(f1_score(y_true=y_test, y_pred=y_pred, average='weighted')))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### DM模型创建和训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 43941/43941 [00:00<00:00, 2471263.81it/s]\n"
     ]
    }
   ],
   "source": [
    "# DM-mode Doc2Vec (dm=1); alpha and min_alpha start equal because the training\n",
    "# cell lowers them by hand after every pass.\n",
    "model_dmm = Doc2Vec(\n",
    "    dm=1,\n",
    "    dm_mean=1,\n",
    "    window=10,\n",
    "    negative=5,\n",
    "    min_count=1,\n",
    "    workers=5,\n",
    "    alpha=0.065,\n",
    "    min_alpha=0.065,\n",
    ")\n",
    "# Build the vocabulary from the training documents\n",
    "model_dmm.build_vocab(list(tqdm(train_tagged.values)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 43941/43941 [00:00<00:00, 1347226.35it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3097042.67it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3020253.55it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3020253.55it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3061035.93it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2976163.68it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3012896.83it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3012305.90it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3025012.51it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3015706.91it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2960673.29it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2999803.25it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2957490.12it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2913910.29it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3011715.21it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3018917.79it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3017237.40it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3022928.62it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2988082.04it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2952468.03it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2929798.62it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3049084.49it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3006409.34it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3057734.88it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3040131.83it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 2951616.92it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3033526.66it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3017286.80it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3003567.61it/s]\n",
      "100%|██████████| 43941/43941 [00:00<00:00, 3057227.66it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2min 36s, sys: 1min 58s, total: 4min 34s\n",
      "Wall time: 3min 9s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Manual learning-rate schedule: one training pass per loop iteration, with alpha\n",
    "# lowered by ALPHA_STEP after every pass and min_alpha pinned to alpha so the rate\n",
    "# stays constant within a pass. model_dmm is created with alpha=0.065, so a first\n",
    "# run of this cell ends at 0.065 - 30 * 0.002 = 0.005.\n",
    "N_EPOCHS = 30\n",
    "ALPHA_STEP = 0.002\n",
    "ALPHA_FLOOR = 0.0001  # lower bound that keeps the learning rate positive\n",
    "\n",
    "# The progress bar tracks epochs; wrapping the document list only timed a list copy.\n",
    "for epoch in tqdm(range(N_EPOCHS), desc='DM epochs'):\n",
    "    # Reshuffle the documents before every pass.\n",
    "    epoch_docs = utils.shuffle(list(train_tagged.values))\n",
    "    model_dmm.train(epoch_docs, total_examples=len(epoch_docs), epochs=1)\n",
    "    # Without the floor, re-running this cell would push alpha below zero.\n",
    "    model_dmm.alpha = max(model_dmm.alpha - ALPHA_STEP, ALPHA_FLOOR)\n",
    "    model_dmm.min_alpha = model_dmm.alpha"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Testing accuracy 0.33804163126593034\n",
      "Testing F1 score: 0.3107571794952185\n"
     ]
    }
   ],
   "source": [
    "# Re-infer the features with model_dmm, refit the classifier and score it\n",
    "y_train, X_train = vec_for_learning(model=model_dmm, tagged_docs=train_tagged)\n",
    "y_test, X_test = vec_for_learning(model=model_dmm, tagged_docs=test_tagged)\n",
    "\n",
    "y_pred = logreg.fit(X_train, y_train).predict(X_test)\n",
    "\n",
    "print('Testing accuracy %s' % accuracy_score(y_true=y_test, y_pred=y_pred))\n",
    "print('Testing F1 score: {}'.format(f1_score(y_true=y_test, y_pred=y_pred, average='weighted')))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 合成新模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call delete_temporary_training_data on both models, keeping doctag vectors and inference data\n",
    "for trained_model in (model_dbow, model_dmm):\n",
    "    trained_model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.test.test_doc2vec import ConcatenatedDoc2Vec\n",
    "\n",
    "# Combine model_dbow and model_dmm (in that order) into one ConcatenatedDoc2Vec wrapper\n",
    "new_model = ConcatenatedDoc2Vec(\n",
    "    [model_dbow, model_dmm],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Testing accuracy 0.7891886151231946\n",
      "Testing F1 score: 0.7899100861776134\n"
     ]
    }
   ],
   "source": [
    "def get_vectors(model, tagged_docs):\n",
    "    '''Return (targets, regressors) for tagged_docs.\n",
    "\n",
    "    Thin alias of vec_for_learning, so the inference logic is defined only once in\n",
    "    the notebook instead of being copied under a second name.\n",
    "\n",
    "    Args:\n",
    "        model: object exposing infer_vector(words, steps=...).\n",
    "        tagged_docs: container whose .values yields documents with .words and .tags.\n",
    "\n",
    "    Returns:\n",
    "        Whatever vec_for_learning(model, tagged_docs) returns.\n",
    "    '''\n",
    "    return vec_for_learning(model, tagged_docs)\n",
    "\n",
    "# Infer features with the combined model, refit the classifier and score it\n",
    "y_train, X_train = get_vectors(model=new_model, tagged_docs=train_tagged)\n",
    "y_test, X_test = get_vectors(model=new_model, tagged_docs=test_tagged)\n",
    "\n",
    "y_pred = logreg.fit(X_train, y_train).predict(X_test)\n",
    "\n",
    "print('Testing accuracy %s' % accuracy_score(y_true=y_test, y_pred=y_pred))\n",
    "print('Testing F1 score: {}'.format(f1_score(y_true=y_test, y_pred=y_pred, average='weighted')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
